{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "- Author of code: Stiene Praet \n",
    "- Date: 27/11/2019\n",
    "- Purpose: calculate the entropy and ideology per page and per user for the US2016_FB_likes paper\n",
    "- Data IN:  2016 user likes data and user survey data. Loaded as csv files into pandas dataframes\n",
    "- Data OUT: Excel files with the results per page and per user \n",
    "- Machine: local"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "# import libraries\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import math\n",
    "%matplotlib inline\n",
    "import seaborn as sns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\stien\\anaconda3\\lib\\site-packages\\numpy\\lib\\arraysetops.py:583: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison\n",
      "  mask |= (ar1 == a)\n"
     ]
    }
   ],
   "source": [
    "# Load the user likes data; the first CSV column is used as the row index.\n",
    "# NOTE(review): ids are parsed with inferred dtypes; pass\n",
    "# converters={'resp_id': str, 'page_id': str} if they should be read as strings.\n",
    "user_likes = pd.read_csv(\n",
    "    '../data/user_likes.csv',\n",
    "    encoding='UTF-8',\n",
    "    index_col=0,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the user survey data.\n",
    "# NOTE(review): resp_id is parsed with an inferred dtype; pass\n",
    "# converters={'resp_id': str} if it should be read as a string.\n",
    "user_ideo = pd.read_csv(\n",
    "    '../data/survey_data_clean.csv',\n",
    "    encoding='UTF-8',\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 0. Like distribution\n",
    "Figure 1: Page “likes” distribution"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Count the number of likes per page and attach one page_id to every page name.\n",
    "# The id lookup is deduplicated BEFORE the merge so the join stays at one row per\n",
    "# page instead of expanding to one row per like and being collapsed afterwards.\n",
    "page_ids = user_likes[['page_name', 'page_id']].drop_duplicates(subset='page_name')  # first page_id seen per name\n",
    "page_count = (\n",
    "    user_likes.groupby('page_name')['resp_id'].count()  # non-null resp_id per page name\n",
    "    .rename('page_count')\n",
    "    .reset_index()\n",
    "    .merge(page_ids, how='left', on='page_name')\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\stien\\anaconda3\\lib\\site-packages\\seaborn\\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `kdeplot` (an axes-level function for kernel density plots).\n",
      "  warnings.warn(msg, FutureWarning)\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "(0.0, 0.4)"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAZIAAAEKCAYAAAA4t9PUAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/MnkTPAAAACXBIWXMAAAsTAAALEwEAmpwYAAAdlUlEQVR4nO3de5SdVZ3m8e+TSopUVbiMJNwSaCKmibEXUboIOrhUsMGEdjrgNSjQ3jpEiTY6OB2clrHbmbVMt+3SnhUNmXQUbTGDF+io0WCjI854IYXQQJBoDMykDEoC5EpIqOQ3f7xvWW9OTlWdyjm7Tuq8z2etWue8l33OrneRetj7fffeigjMzMyO1rhmV8DMzMY2B4mZmdXFQWJmZnVxkJiZWV0cJGZmVhcHiZmZ1SVpkEiaK2mjpE2Slgxx3gWSDkp600jLmplZcyULEkltwDJgHjALuErSrEHOWwqsG2lZMzNrvpQtkjnApojYHBEHgNXA/CrnvR/4OvDkUZQ1M7MmG5/ws6cCWwrbvcCFxRMkTQWuBC4BLhhJ2cJnLAQWAnR1df3xzJkz6664mVlZ3HfffdsjYko9n5EySFRlX+V8LJ8G/ioiDkqHnV5L2WxnxApgBUB3d3f09PSwYwf87ndw7rkjrrOZWalI+r/1fkbKIOkFzixsTwO2VpzTDazOQ2QycLmkvhrLDurOO2HtWrj99qOotZmZjUjKIFkPzJA0HfgNsAB4W/GEiJje/17SF4BvRcSdksYPV3You3fDjh1119/MzGqQLEgiok/SYrKnsdqAVRGxQdKi/PjykZat9bt374adO+urv5mZ1SZli4SIWAusrdhXNUAi4h3Dla3V7t3Zj5mZpdeSI9t37XKQmJmNFgeJmZnVpSWDZPdu2LMHvPijmVl6LRskBw/Cc881uyZmZq2vJYNkz57sddeu5tbDzKwMHCRmZlaXlg2S4493kJiZjYaWDZIpU/zklpnZaGjJINm7FyZPdovEzGw0tFyQ9PVlPyed5CAxMxsNLRcku3dDZ2f24yAxM0uv5YJkzx7o6oKODt8jMTMbDS0XJMUWiWcANjNLr+WCZM+eLEQ6OhwkZmajoeWCZPdumDgx695ykJiZpddyQdLfInHXlpnZ6Gi5INm9O+vW6uryzXYzs9HQckGyZ0/WtdXR4cd/zcxGQ9IgkTRX0kZJmyQtqXJ8vqQHJT0gqUfSKwvHHpf0UP+xWr/TLRIzs9GVbM12SW3AMuBSoBdYL2lNRDxSOO1uYE1EhKTzgNuBmYXjF0fE9pF8b//N9s5OB4mZ2WhI2SKZA2yKiM0RcQBYDcwvnhAReyJ+v45hF1D3moa7dg20SPqnkzczs3RSBslUYEthuzffdxhJV0p6FPg28K7CoQDuknSfpIW1fml/kEycCPv2ZSslmplZOimDRFX2HdHiiIg7ImImcAXw8cKhiyLifGAecL2kV1X9Emlhfn+lZ9u2bb8f2T5uXBYobpWYmaWVMkh6gTML29OArYOdHBH3AOdImpxvb81fnwTuIOsqq1ZuRUR0R0T3lClTfn+zHWDSJD+5ZWaWWsogWQ/MkDRdUjuwAFhTPEHSiyQpf38+0A48JalL0vH5/i7gMuDhWr60f0Ai+MktM7PRkOyprYjok7QYWAe0AasiYoOkRfnx5cAbgWslPQ/sA96aP8F1KnBHnjHjgdsi4ru1fG+xReKp5M3M0ksWJAARsRZYW7FveeH9UmBplXKbgdlH85179w4ESVeXg8TMLLWWHNnuFomZ2ehpuSDZu3fgHokXtzIzS6/lguTZZ7MxJOD5tszMRkNLBcmhQ3DccdDWlm07SMzM0mupIDl4MLvB3q+zE3bsaFp1zMxKoaWC5NChgRvt4Ke2zMxGQ0sFSWWLxOu2m5ml11JB4haJmdnoa6kgOXhw4NFf8DgSM7PR0FJBcujQwKO/4Lm2zMxGQ0sFycGDh3dteZVEM7P0Wi5Iii0S
B4mZWXotFSTVura8sJWZWVotFSSVXVsTJkAE7N/fvDqZmbW6lg4SyTfczcxSa6kgOXTo8Md/wWNJzMxSa6kgqWyRgIPEzCy1lguSai0Sd22ZmaXTUkFSOUUKeCp5M7PUkgaJpLmSNkraJGlJlePzJT0o6QFJPZJeWWvZaqrdI/E0KWZmaSULEkltwDJgHjALuErSrIrT7gZmR8RLgXcBK0dQ9gjV7pE4SMzM0krZIpkDbIqIzRFxAFgNzC+eEBF7IiLyzS4gai1bzWBdW75HYmaWTsogmQpsKWz35vsOI+lKSY8C3yZrldRcNi+/MO8W6zl4cFvVIPGaJGZm6aQMElXZF0fsiLgjImYCVwAfH0nZvPyKiOiOiG5pCu3thx/v7HSQmJmllDJIeoEzC9vTgK2DnRwR9wDnSJo80rL9xlX5bRwkZmZppQyS9cAMSdMltQMLgDXFEyS9SJLy9+cD7cBTtZStplqQeECimVla41N9cET0SVoMrAPagFURsUHSovz4cuCNwLWSngf2AW/Nb75XLTvcdw7WInGQmJmlkyxIACJiLbC2Yt/ywvulwNJayw7HQWJmNvpaamT7YF1bfvzXzCydlgoSVXnWy6skmpml1VJBMljXlldJNDNLpzRBElVHoZiZWb1aPkja2uC442Dv3tGvj5lZGbRUkFS7RwIwaZLvk5iZpdJSQVKtRQJ+BNjMLKVSBIlHt5uZpVOKIHGLxMwsnVIEiVskZmbplCJIvLiVmVk6LRUkgz211dHhFomZWSotFSRtbdX3+x6JmVk6LRUkQ3VteXErM7M0WipIBuvacovEzCydlgqSoR7/dYvEzCyNlgqSwXR1OUjMzFIpRZC4a8vMLJ2kQSJprqSNkjZJWlLl+NslPZj//FjS7MKxxyU9JOkBST311MOLW5mZpZNszXZJbcAy4FKgF1gvaU1EPFI47THg1RHxjKR5wArgwsLxiyNie711cZCYmaWTskUyB9gUEZsj4gCwGphfPCEifhwRz+SbPwWmpaiI1203M0snZZBMBbYUtnvzfYN5N/CdwnYAd0m6T9LCwQpJWiipR1LPzp3bqp7jFomZWTrJuraAaqM6qi54K+lisiB5ZWH3RRGxVdIpwPckPRoR9xzxgREryLrEOPfc7qqfP3EiHDgAfX0wPuVvbGZWQilbJL3AmYXtacDWypMknQesBOZHxFP9+yNia/76JHAHWVfZUZHcvWVmlkrKIFkPzJA0XVI7sABYUzxB0lnAN4BrIuKXhf1dko7vfw9cBjxcT2UmTfIjwGZmKSTr6ImIPkmLgXVAG7AqIjZIWpQfXw7cDJwMfFbZ/CZ9EdENnArcke8bD9wWEd+tpz5ek8TMLI2kdwwiYi2wtmLf8sL79wDvqVJuMzC7cn89fMPdzCyNUoxsB49uNzNLxUFiZmZ1cZCYmVldShMkXrfdzCyNUgWJWyRmZo1XU5BI+rqkP5U0ZoOnsxN27Gh2LczMWk+twfA54G3AryR9QtLMhHVKwqskmpmlUVOQRMS/RsTbgfOBx8nmvvqxpHdKmpCygo3icSRmZmnU3FUl6WTgHWQDCO8HPkMWLN9LUrMGc4vEzCyNmka2S/oGMBP4EvAfIuKJ/ND/rHf1wtHiKVLMzNKodYqUlfl0J78n6biI2J/PjXXMc9eWmVkatXZt/dcq+37SyIqk5iAxM0tjyBaJpNPIVjXskPQyBharOgHoTFy3hvJ6JGZmaQzXtfU6shvs04BPFfbvBj6SqE5JdHTAnj0QkS10ZWZmjTFkkETErcCtkt4YEV8fpTol0d4O48bBc89loWJmZo0xXNfW1RHxz8DZkj5UeTwiPlWl2DFr0qSse8tBYmbWOMN1bXXlr5NSV2Q09D8CfMopza6JmVnrGK5r65b89W9GpzppeSyJmVnj1Tpp499JOkHSBEl3S9ou6eoays2VtFHSJklLqhx/u6QH858fS5pda9mj4TVJzMwar9ZxJJdFxC7g9UAv8IfA
h4cqIKkNWAbMA2YBV0maVXHaY8CrI+I84OPAihGUHTE/Amxm1ni1Bkn/xIyXA1+JiKdrKDMH2BQRmyPiALAamF88ISJ+HBHP5Js/JXvMuKayR8NrkpiZNV6tQfJNSY8C3cDdkqYAzw1TZiqwpbDdm+8bzLuB74y0rKSFknok9ezcuW3ICjlIzMwar9Zp5JcArwC6I+J5YC/DtxCqDfuLqidKF5MFyV+NtGxErIiI7ojoPvHEKUNWyEFiZtZ4tU7aCPBisvEkxTJfHOL8XuDMwvY0YGvlSZLOA1YC8yLiqZGUHSnfbDcza7xap5H/EnAO8ABwMN8dDB0k64EZkqYDvwEWkK2yWPzcs4BvANdExC9HUvZodHR4TRIzs0artUXSDcyKiKrdS9VERJ+kxcA6oA1YFREbJC3Kjy8HbgZOBj6rbAKsvrybqmrZmn+rQXR1wda62zVmZlZUa5A8DJwGPDHciUX5GiZrK/YtL7x/D9mKizWVrZdXSTQza7xag2Qy8Iike4H9/Tsj4s+S1CoRjyMxM2u8WoPkYykrMVp8s93MrPFqCpKI+KGkPwBmRMS/Suoku3cxpniVRDOzxqt1rq2/AL4G3JLvmgrcmahOybhry8ys8Wod2X49cBGwCyAifgWMucnY3SIxM2u8WoNkfz7nFQD5oMSaHwU+VnR0wL59cOhQs2tiZtY6ag2SH0r6CNAh6VLgq8A301UrjXHjBtZuNzOzxqg1SJYA24CHgOvIxnf8dapKpeTFrczMGqvWp7YOSboTuDMihp5i9xg3aZKDxMyskYZskSjzMUnbgUeBjZK2Sbp5dKrXeB0dvuFuZtZIw3Vt3UD2tNYFEXFyRLwAuBC4SNIHU1cuBXdtmZk11nBBci1wVUQ81r8jIjYDV+fHxhyPbjcza6zhgmRCRGyv3JnfJ5lQ5fxjnoPEzKyxhguSA0d57Jg1caLvkZiZNdJwT23NllTt/98FTExQn+TcIjEza6whgyQixtzEjMPxmiRmZo1V64DEltHZCTt2NLsWZmato5RB4nskZmaNkzRIJM2VtFHSJklLqhyfKeknkvZLurHi2OOSHpL0gKSeRtXJXVtmZo1V6wqJIyapDVgGXAr0AuslrYmIRwqnPQ18ALhikI+5uNrjx/XwgEQzs8ZK2SKZA2yKiM35FPSrgfnFEyLiyYhYDzyfsB6H8VNbZmaNlTJIpgJbCtu9+b5aBXCXpPskLRzsJEkLJfVI6tm5c/j5JDs7PY28mVkjpQwSVdk3ksWwLoqI84F5wPWSXlXtpIhYERHdEdF94olThv1Q32w3M2uslEHSC5xZ2J4GbK21cERszV+fBO4g6yqrW1eXWyRmZo2UMkjWAzMkTZfUDiwA1tRSUFKXpOP73wOXAQ83olLt7XDwIBwYkxO8mJkde5I9tRURfZIWA+uANmBVRGyQtCg/vlzSaUAPcAJwSNINwCxgMnCHpP463hYR321EvaSsVbJ7N5x8ciM+0cys3JIFCUBErCVblre4b3nh/W/Jurwq7QJmp6pX/yPADhIzs/qVbmQ7eCyJmVkjOUjMzKwupQwSPwJsZtY4pQ0St0jMzBqjlEHS0eEgMTNrFAeJmZnVxUFiZmZ1KWWQeE0SM7PGKWWQdHU5SMzMGqWUQeIWiZlZ45Q2SDyOxMysMUobJL7ZbmbWGKUMkv7Zf83MrH6lDBK3SMzMGqe0QbJ3b7NrYWbWGkobJLt3Q4xkBXkzM6uqlEEyfny25O6zzza7JmZmY18pgwRg0iTfJzEza4SkQSJprqSNkjZJWlLl+ExJP5G0X9KNIylbL48lMTNrjGRBIqkNWAbMA2YBV0maVXHa08AHgE8eRdm6eJVEM7PGSNkimQNsiojNEXEAWA3ML54QEU9GxHrg+ZGWrZeDxMysMVIGyVRgS2G7N9/X0LKSFkrqkdSzc+e2mivnsSRmZo2RMkhUZV+tD9zWXDYiVkREd0R0n3jilJor19HheyRmZo2QMkh6gTML
29OAraNQtiZe3MrMrDFSBsl6YIak6ZLagQXAmlEoWxN3bZmZNcb4VB8cEX2SFgPrgDZgVURskLQoP75c0mlAD3ACcEjSDcCsiNhVrWwj69fR4TVJzMwaIVmQAETEWmBtxb7lhfe/Jeu2qqlsI7lFYmbWGKUd2e5VEs3MGqO0QeJ1283MGqO0QeKuLTOzxnCQmJlZXUobJF1dsGdPs2thZjb2lTZIPPuvmVljlDZIurocJGZmjVDaIJk4Efbvh76+ZtfEzGxsK22QSFn3lu+TmJnVp7RBAl5u18ysEUodJF7cysysfqUOEo8lMTOrX+mDxE9umZnVp9RB4q4tM7P6lTpIvEqimVn9HCQOEjOzupQ+SHyPxMysPqUOks5O2LGj2bUwMxvbkgaJpLmSNkraJGlJleOS9I/58QclnV849rikhyQ9IKknRf28SqKZWf2SrdkuqQ1YBlwK9ALrJa2JiEcKp80DZuQ/FwKfy1/7XRwR21PVsbMTtmxJ9elmZuWQskUyB9gUEZsj4gCwGphfcc584IuR+SlwkqTTE9bpMH7818ysfimDZCpQ/P/93nxfrecEcJek+yQtTFFBj2w3M6tfsq4tQFX2xQjOuSgitko6BfiepEcj4p4jviQLmYUAp5561ogq6DVJzMzql7JF0gucWdieBmyt9ZyI6H99EriDrKvsCBGxIiK6I6L7xBOnjKiCHkdiZla/lEGyHpghabqkdmABsKbinDXAtfnTWy8HdkbEE5K6JB0PIKkLuAx4uNEV9LrtZmb1S9a1FRF9khYD64A2YFVEbJC0KD++HFgLXA5sAp4F3pkXPxW4Q1J/HW+LiO82uo79C1tFZAtdmZnZyKW8R0JErCULi+K+5YX3AVxfpdxmYHbKugG0t2cBsn9/tvSumZmNXKlHtoOf3DIzq1fpg2TSJD+5ZWZWj9IHSVcX/Pa3za6FmdnYVfoged3r4H3vg+eea3ZNzMzGptIHyRVXwJQpcP0Rt/zNzKwWpQ8SCT70Ifj+9+Hzn292bczMxp7SBwlkT27dfDPceCP82781uzZmZmOLgyQ3fTosWgRveIPXKDEzGwkHScGll8J558E112Sj3c3MbHgOkgrvfS/8+tfwyU82uyZmZmODg6RCezt89KOwdCncc8Sk9WZmVslBUsVpp8GHPwxvfasHK5qZDcdBMogLL8wGK775zdDX1+zamJkduxwkQ7jmGjhwAD7ykWbXxMzs2OUgGUJbG9x0E3zpS7CmckkuMzMDHCTDOumk7Ob7u98Nmzc3uzZmZsceB0kNZs2Ct70NXvEKeOc74ctfhq2Vq8+bmZWUg6RGV16ZPRJ8wgmwciW85CUwYwZcdx189auwbVuza2hm1hyKFhrCfe653XHLLT2j8l2HDmUDF++/Hx56CB54AM46Cy65BF772ixozjgjW+/EzOxYJem+iOiu5zOSrtkuaS7wGaANWBkRn6g4rvz45cCzwDsi4ue1lG22ceOyFsmMGfCWt8DBg7BxYxYoS5dCb2/WSpkwAU49FU4/PQuWM86AqVOz19NPz46dcEK2UmNXV3a+mdlYkixIJLUBy4BLgV5gvaQ1EfFI4bR5wIz850Lgc8CFNZY9prS1ZfdSZs0a2BcBe/fC9u3w1FPw9NPZ+3vvhWeeybaffhqefXbgZ8KEbDbirq6BcOl/7eyE447LfiZOPPy18mf8+Np+2toGXos/1fa1tWUBKmWvxZ/iPunIHzNrXSlbJHOATRGxGUDSamA+UAyD+cAXI+tf+6mkkySdDpxdQ9kjbN2aTQc/VnR1Hd71FZGNW9m3LwuVXbuykfX79sH+/c2rp5nZUFIGyVRgS2G7l6zVMdw5U2ssC4CkhcDCfHP/j36kh+uocyuZDGxvdiWOAb4OA3wtBvhaDDi33g9IGSTVOjQq7+wPdk4tZbOdESuAFQCSeuq9adQqfC0yvg4DfC0G+FoMkFT3E0opg6QXOLOwPQ2oHH0x2DntNZQ1M7NjQMpx
JOuBGZKmS2oHFgCVE42sAa5V5uXAzoh4osayZmZ2DEjWIomIPkmLgXVkj/CuiogNkhblx5cDa8ke/d1E9vjvO4cqW8PXrmj8bzJm+VpkfB0G+FoM8LUYUPe1aKkBiWZmNvo8RYqZmdXFQWJmZnVpiSCRNFfSRkmbJC1pdn1Gk6QzJf1A0i8kbZD0l/n+F0j6nqRf5a//rtl1HS2S2iTdL+lb+XYpr0U+wPdrkh7N//t4RYmvxQfzfx8PS/qKpIlluRaSVkl6UhoYYzfU7y7ppvxv6UZJr6vlO8Z8kBSmU5kHzAKukjRr6FItpQ/4jxHxYuDlwPX5778EuDsiZgB359tl8ZfALwrbZb0WnwG+GxEzgdlk16R010LSVOADQHdE/BHZAzwLKM+1+AIwt2Jf1d89/9uxAHhJXuaz+d/YIY35IKEwFUtEHAD6p1MphYh4on+iy4jYTfbHYirZNbg1P+1W4IqmVHCUSZoG/CmwsrC7dNdC0gnAq4B/AoiIAxGxgxJei9x4oEPSeKCTbFxaKa5FRNwDPF2xe7DffT6wOiL2R8RjZE/UzhnuO1ohSAabZqV0JJ0NvAz4GXBqPiaH/PWUJlZtNH0a+E/AocK+Ml6LFwLbgM/n3XwrJXVRwmsREb8BPgn8P+AJsvFqd1HCa1Ew2O9+VH9PWyFIap5OpZVJmgR8HbghInY1uz7NIOn1wJMRcV+z63IMGA+cD3wuIl4G7KV1u26GlPf/zwemA2cAXZKubm6tjllH9fe0FYKklqlYWpqkCWQh8uWI+Ea++3f5TMrkr082q36j6CLgzyQ9TtbFeYmkf6ac16IX6I2In+XbXyMLljJeiz8BHouIbRHxPPAN4N9TzmvRb7Df/aj+nrZCkJR6OpV8cbB/An4REZ8qHFoD/Hn+/s+Bfxntuo22iLgpIqZFxNlk/x18PyKuppzX4rfAFkn9M7u+lmwZhtJdC7IurZdL6sz/vbyW7F5iGa9Fv8F+9zXAAknHSZpOtlbUvcN9WEuMbJd0OVnfeP90Kv+tuTUaPZJeCfwIeIiB+wIfIbtPcjtwFtk/pDdHROUNt5Yl6TXAjRHxekknU8JrIemlZA8dtAObyaYgGkc5r8XfAG8le8rxfuA9wCRKcC0kfQV4DdnU+b8D/gtwJ4P87pL+M/Ausmt1Q0R8Z9jvaIUgMTOz5mmFri0zM2siB4mZmdXFQWJmZnVxkJiZWV0cJGZmVhcHibUUSSHpHwrbN0r6WIM++wuS3tSIzxrme96cz9b7g4r9Z/fP4CqpW9I/5u8/JunG1PUyG4yDxFrNfuANkiY3uyJFtcygWvBu4H0RcfFgJ0RET0R8oP6amdXPQWKtpo9sDeoPVh6obFFI2pO/vkbSDyXdLumXkj4h6e2S7pX0kKRzCh/zJ5J+lJ/3+rx8m6S/l7Re0oOSrit87g8k3UY2YLSyPlfln/+wpKX5vpuBVwLLJf39YL9k/tnfqrL/LyR9R1KHpKvz3+EBSbfk9WzLr8PD+XcfcZ3MRmp8sytglsAy4EFJfzeCMrOBF5NNt70ZWBkRc5QtFPZ+4Ib8vLOBVwPnAD+Q9CLgWrIZZS+QdBzwfyTdlZ8/B/ijfEru35N0BrAU+GPgGeAuSVdExN9KuoRsVH7PSH5pSYuBy8imBH8h2UjuiyLieUmfBd4ObACm5utyIOmkkXyHWTUOEms5EbFL0hfJFjPaV2Ox9f3Takv6NdAfBA8BxS6m2yPiEPArSZuBmWR/vM8rtHZOJJuj6ABwb2WI5C4A/ldEbMu/88tk64fcWWN9K11DNuHeFXlwvJYspNZn00vRQTYx3zeBF0r678C3C7+n2VFzkFir+jTwc+DzhX195N25+eR97YVj+wvvDxW2D3H4v5PKOYWCbOrt90fEuuKBfL6vvYPUr9p03fV4GHgp2Wytj+Wff2tE3HTEF0uzgdcB1wNvIZtXyeyo+R6JtaR8ArrbyW5c93uc7P/SIVufYsJRfPSbJY3L75u8
ENgIrAPem0/nj6Q/zBeRGsrPgFdLmpzfiL8K+OFR1Kff/cB1wJq82+xu4E2STsnr9AJJf5A/hDAuIr4OfJRsanmzurhFYq3sH4DFhe3/AfyLpHvJ/tAO1loYykayP/inAosi4jlJK8nunfw8b+lsY5hlWyPiCUk3AT8gaz2sjYi6pjGPiP+dPwb8beBS4K/J7r2MA54na4HsI1s1sf9/Io9osZiNlGf/NTOzurhry8zM6uIgMTOzujhIzMysLg4SMzOri4PEzMzq4iAxM7O6OEjMzKwu/x/KOhYtcikSpwAAAABJRU5ErkJggg==\n",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Figure 1: kernel density estimate of the number of likes per page.\n",
    "# sns.distplot is deprecated; kdeplot draws the same filled density curve.\n",
    "fig, ax = plt.subplots()\n",
    "sns.kdeplot(x=page_count['page_count'], fill=True, linewidth=1, color='b', ax=ax)\n",
    "ax.set_xlabel('Number of likes')\n",
    "ax.set_ylabel('Density')\n",
    "ax.set_xlim(0, 100)  # zoom in on pages with up to 100 likes\n",
    "ax.set_ylim(0, 0.4);"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 1. Calculate ideology and homogeneity per page"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## A. Ideology per page\n",
    "- The ideology per page is the average of the ideology scores of the users that liked the page \n",
    "- We only consider pages with at least 30 likes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Recode ideo5: the value 6 (not sure) becomes NaN and the remaining scale is shifted\n",
    "# to start at 0, i.e. 0= very liberal, 1= liberal, 2= moderate, 3= conservative, 4 = very conservative.\n",
    "# The result is assigned back to the column instead of calling replace(..., inplace=True)\n",
    "# on the selected column: that chained in-place form does not update the DataFrame\n",
    "# when pandas copy-on-write is active.\n",
    "user_ideo['ideo'] = user_ideo['ideo5'].replace(6, np.nan) - 1\n",
    "user_likes_ideo = user_likes.merge(user_ideo, on='resp_id')  # add user ideo to user likes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep the names of pages with strictly more than x likes, i.e. at least 30.\n",
    "x = 29\n",
    "enough_likes = page_count['page_count'] > x\n",
    "pages_30 = page_count.loc[enough_likes, 'page_name'].tolist()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Number of liked pages per respondent (non-null page_name per resp_id).\n",
    "user_count = (\n",
    "    user_likes.groupby('resp_id')['page_name'].count()\n",
    "    .rename('user_count')\n",
    "    .reset_index()\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Users that liked at least one of the pages in pages_30, with their survey data\n",
    "# and their total number of likes.\n",
    "likes_on_pages_30 = user_likes[user_likes['page_name'].isin(pages_30)]\n",
    "users30 = set(likes_on_pages_30['resp_id'].tolist())\n",
    "user_ideo30 = user_ideo[user_ideo['resp_id'].isin(users30)].merge(user_count, on='resp_id')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Page ideology: mean ideology of the users that liked the page, divided by 4\n",
    "# (the top of the 0-4 scale) so the score lies between 0 and 1.\n",
    "on_page_30 = user_likes_ideo['page_name'].isin(pages_30)\n",
    "by_user_30 = user_likes_ideo['resp_id'].isin(users30)\n",
    "user_likes_ideo_30 = user_likes_ideo[on_page_30 & by_user_30]\n",
    "page_ideo = (\n",
    "    user_likes_ideo_30.groupby('page_name')['ideo'].mean()\n",
    "    .div(4)\n",
    "    .rename('page_ideo')\n",
    "    .reset_index()\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Combine like counts and ideology for the pages with more than x likes.\n",
    "pages_min_likes = page_count[page_count['page_count'] > x]\n",
    "pages_info = pages_min_likes.merge(page_ideo[['page_name', 'page_ideo']], on='page_name')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Correction factor: like-weighted mean user ideology (rescaled to 0-1) minus 0.5.\n",
    "# Users without an ideology score (NaN) are excluded from BOTH the numerator and the\n",
    "# denominator; summing their like counts in the denominator only would pull the\n",
    "# weighted mean towards 0.\n",
    "has_ideo = user_ideo30['ideo'].notna()\n",
    "weights = user_ideo30.loc[has_ideo, 'user_count']\n",
    "weighted_mean_ideo = (user_ideo30.loc[has_ideo, 'ideo'] * weights).sum() / weights.sum()\n",
    "corr = weighted_mean_ideo / 4 - 0.5"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Shift every page score by the correction factor (vectorised subtraction).\n",
    "pages_info['page_ideo_corr'] = pages_info['page_ideo'] - corr"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## B. Homogeneity per page"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### a. Entropy per page\n",
    "- The entropy per page is calculated as entropy = -p1·log2(p1) - p2·log2(p2) - p3·log2(p3), with p1 the ratio of liberals that liked the page, p2 the ratio of moderates and p3 the ratio of conservatives. In the case of three classes, entropy ranges from close to zero at minimal disorder (the page is liked almost exclusively by members of the same class) to log2(3) ≈ 1.58 at maximum disorder (i.e. the classes are balanced with 33% class liberal, 33% class moderate and 33% class conservative). We rescale entropy between 0 and 1 by dividing it by 1.58.\n",
    "See appendix D.2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "# entropy term of a single class\n",
    "def entr_term(ratio):\n",
    "    \"\"\"Return -ratio * log2(ratio), the entropy contribution of one class share.\n",
    "\n",
    "    A share of exactly 0 contributes 0 (log2(0) is never evaluated).\n",
    "    \"\"\"\n",
    "    if ratio == 0:\n",
    "        return 0\n",
    "    return -(ratio) * np.log2(ratio)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "# calculate entropy for all pages with minimum 30 likes\n",
    "max_entropy = np.log2(3)  # entropy of three perfectly balanced classes (about 1.585)\n",
    "entropy = []  # scaled entropy per page, aligned with pages_30\n",
    "for page in pages_30:\n",
    "    userlist = user_likes.loc[user_likes['page_name'] == page, 'resp_id']  # users that liked page\n",
    "    # ideology of users that liked page; users without a score (NaN) are dropped so\n",
    "    # that the three class shares below sum to 1\n",
    "    ideos = user_ideo.loc[user_ideo['resp_id'].isin(userlist), 'ideo'].dropna()\n",
    "    if ideos.empty:\n",
    "        entropy.append(np.nan)  # no scored user liked this page: entropy is undefined\n",
    "        continue\n",
    "\n",
    "    n_scored = len(ideos)\n",
    "    left_ratio = (ideos < 2).sum() / n_scored  # ratio of liberal users (0, 1)\n",
    "    center_ratio = (ideos == 2).sum() / n_scored  # ratio of moderate users (2)\n",
    "    right_ratio = (ideos > 2).sum() / n_scored  # ratio of conservative users (3, 4)\n",
    "    entr = entr_term(left_ratio) + entr_term(center_ratio) + entr_term(right_ratio)\n",
    "    entropy.append(entr / max_entropy)  # scale between 0 and 1 (divide by max entropy)\n",
    "\n",
    "page_entr = pd.DataFrame({'page_name': pages_30, 'entropy': entropy})  # page_name and scaled entropy"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Add the scaled entropy to pages_info (inner join on page_name).\n",
    "pages_info = pd.merge(pages_info, page_entr, on='page_name')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### b. Cramer's V per page"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Restrict to users with an ideology score (so no \"unsure\").\n",
    "scored_ids = user_ideo.loc[user_ideo['ideo'].notna(), 'resp_id'].tolist()  # survey users with a score\n",
    "user_likes_with_ideo = user_likes[user_likes['resp_id'].isin(scored_ids)]  # their facebook likes\n",
    "user_with_ideo = user_likes_with_ideo['resp_id'].unique()  # ids that have a score and appear in the likes data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Survey rows of the users that have an ideology score and appear in the likes data.\n",
    "in_scored_likes = user_ideo['resp_id'].isin(user_with_ideo)\n",
    "user_with_ideo_ideo = user_ideo.loc[in_scored_likes]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Chi-square test of independence and Cramer's V for all pages (with minimum x likes):\n",
    "# 2x3 table of (liked / did not like) x (liberal / moderate / conservative) users.\n",
    "from scipy.stats import chi2_contingency\n",
    "\n",
    "chi_square = []  # chi-square statistic per page, aligned with pages_30\n",
    "cramer_v = []  # Cramer's V per page, aligned with pages_30\n",
    "\n",
    "# NOTE: this rebinds the notebook-wide user_likes to the likes of users with an ideology\n",
    "# score (so no \"unsure\"); cells executed after this one read the filtered frame.\n",
    "user_likes = user_likes_with_ideo\n",
    "\n",
    "for page in pages_30:\n",
    "    userlist = user_likes.loc[user_likes['page_name'] == page, 'resp_id']  # users that liked page\n",
    "    liked = user_with_ideo_ideo['resp_id'].isin(userlist)\n",
    "    ideos = user_with_ideo_ideo.loc[liked, 'ideo']  # ideology of users that liked page\n",
    "    ideos_neg = user_with_ideo_ideo.loc[~liked, 'ideo']  # ideology of users that did not like page\n",
    "\n",
    "    # rows: liked / did not like; columns: liberal (<2), moderate (==2), conservative (>2)\n",
    "    obs = np.array([\n",
    "        [(ideos < 2).sum(), (ideos == 2).sum(), (ideos > 2).sum()],\n",
    "        [(ideos_neg < 2).sum(), (ideos_neg == 2).sum(), (ideos_neg > 2).sum()],\n",
    "    ])\n",
    "\n",
    "    try:\n",
    "        chi2, p, dof, ex = chi2_contingency(obs, correction=False)\n",
    "    except ValueError:  # raised when an expected frequency is zero, e.g. an empty ideology class\n",
    "        chi2 = np.nan\n",
    "    chi_square.append(chi2)\n",
    "    # Cramer's V = sqrt(chi2 / (n * (min(rows, cols) - 1))) with n the total of the table\n",
    "    n_obs = obs.sum()\n",
    "    cramer_v.append(np.sqrt(chi2 / (n_obs * (min(obs.shape) - 1))))\n",
    "\n",
    "# dataframe with page_name, chi-square statistic and Cramer's V\n",
    "page_chi = pd.DataFrame({'page_name': pages_30, 'chi_square': chi_square, 'cramer_v': cramer_v})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Add chi_square and cramer_v to pages_info (inner join on page_name).\n",
    "pages_info = pd.merge(pages_info, page_chi, on='page_name')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### c. Variance per page\n",
    "See appendix D.2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Variance of the ideology scores (0-4 scale) of the users that liked each page.\n",
    "page_var = (\n",
    "    user_likes_ideo_30.groupby('page_name')['ideo'].var()\n",
    "    .rename('variance')\n",
    "    .reset_index()\n",
    ")\n",
    "pages_info = pd.merge(pages_info, page_var, on='page_name')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 2. Homogeneity and page ideology per user\n",
    "average page ideology and page homogeneity per user"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "# One row per (page, like): page-level results joined to the likes on that page.\n",
    "# NOTE: user_likes was rebound in the chi-square cell to users with an ideology score.\n",
    "user_likes_pages_info = pd.merge(pages_info, user_likes, how='left', on='page_name')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "# calculate average homogeneity per user (only based on pages with minimum 30 likes!)\n",
    "homogeneity_vars = ['cramer_v', 'entropy', 'variance']\n",
    "user_homogeneity = (\n",
    "    user_likes_pages_info.groupby('resp_id')[homogeneity_vars].mean()\n",
    "    .add_prefix('resp_')  # resp_cramer_v, resp_entropy, resp_variance\n",
    "    .reset_index()\n",
    ")\n",
    "# Drop resp_* columns left by an earlier execution so that re-running this cell does\n",
    "# not produce suffixed duplicates (resp_cramer_v_x / resp_cramer_v_y, ...).\n",
    "user_ideo = (\n",
    "    user_ideo.drop(columns=['resp_' + v for v in homogeneity_vars], errors='ignore')\n",
    "    .merge(user_homogeneity, on='resp_id')  # inner join: keeps users present in user_likes_pages_info\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Average corrected page ideology per user (only based on pages with minimum 30 likes!).\n",
    "user_page_ideo_corr = (\n",
    "    user_likes_pages_info.groupby('resp_id')['page_ideo_corr'].mean()\n",
    "    .rename('resp_page_ideo_corr')\n",
    "    .reset_index()\n",
    ")\n",
    "user_info = pd.merge(user_ideo, user_page_ideo_corr, on='resp_id')  # merge to user_ideo"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Add the number of likes per user to the user data (inner join on resp_id).\n",
    "user_info = pd.merge(user_info, user_count, on='resp_id')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 3. Homogeneity and page ideology per category"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the Facebook categories and their descriptions.\n",
    "Cats = pd.read_excel(io='../data/Facebook_categories.xlsx')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the coded pages and attach the codes to pages_info (inner join on page_name).\n",
    "pages_coded = pd.read_excel(io='../data/pages_coded.xlsx')\n",
    "pages_info = pd.merge(pages_info, pages_coded, on='page_name')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write the per-page results to Excel without the DataFrame index.\n",
    "pages_info.to_excel(\n",
    "    '../data/pages_info_coded.xlsx',\n",
    "    index=False,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [],
   "source": [
    "# List of category names, in the order of the Category column.\n",
    "categories = Cats.loc[:, 'Category'].tolist()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Count the number of pages per category. A page is counted for every category whose\n",
    "# name occurs in its 'category' string. The name is matched literally (regex=False) so\n",
    "# that characters such as '&', '(' or '+' in a label are never read as a regular\n",
    "# expression; pages without a category (NaN) are counted for no category (na=False).\n",
    "length = [\n",
    "    int(pages_info['category'].str.contains(t, regex=False, na=False).sum())\n",
    "    for t in categories\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>categories</th>\n",
       "      <th>len</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Shopping &amp; retail</td>\n",
       "      <td>1858</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Public Figures</td>\n",
       "      <td>876</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Food &amp; Beverage</td>\n",
       "      <td>837</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>Entertainment</td>\n",
       "      <td>532</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Music</td>\n",
       "      <td>499</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Tv Shows</td>\n",
       "      <td>463</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>News &amp; Media</td>\n",
       "      <td>421</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>Government &amp; Politics</td>\n",
       "      <td>353</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>Movies</td>\n",
       "      <td>341</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>Services</td>\n",
       "      <td>339</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>Beauty &amp; Health</td>\n",
       "      <td>337</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>Civil Society</td>\n",
       "      <td>211</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>Interests</td>\n",
       "      <td>166</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>Arts &amp; Culture</td>\n",
       "      <td>110</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>Sports</td>\n",
       "      <td>99</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>Cars and transportation</td>\n",
       "      <td>74</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>Identity &amp; Religion</td>\n",
       "      <td>72</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>Travel</td>\n",
       "      <td>70</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>Individual opinion leaders</td>\n",
       "      <td>51</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19</th>\n",
       "      <td>Research &amp; Education</td>\n",
       "      <td>43</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                    categories   len\n",
       "0            Shopping & retail  1858\n",
       "1               Public Figures   876\n",
       "2              Food & Beverage   837\n",
       "6                Entertainment   532\n",
       "3                        Music   499\n",
       "4                     Tv Shows   463\n",
       "5                 News & Media   421\n",
       "7        Government & Politics   353\n",
       "8                       Movies   341\n",
       "10                    Services   339\n",
       "9              Beauty & Health   337\n",
       "11               Civil Society   211\n",
       "12                   Interests   166\n",
       "13              Arts & Culture   110\n",
       "14                      Sports    99\n",
       "15     Cars and transportation    74\n",
       "17         Identity & Religion    72\n",
       "16                      Travel    70\n",
       "18  Individual opinion leaders    51\n",
       "19        Research & Education    43"
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Table 2: Facebook categories and the number of pages per category\n",
    "category_sizes = pd.DataFrame({'categories': categories, 'len': length})\n",
    "category_sizes.sort_values(by='len', ascending=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "103"
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# number of hardnews pages: rows of pages_info whose news2 value is 'hardnews'\n",
    "pages_info.loc[pages_info['news2'].eq('hardnews')].shape[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [],
   "source": [
    "# add the category, news2, news_politics and group columns of pages_info to user_likes_pages_info\n",
    "# (left join on page_name, so every row of user_likes_pages_info is kept)\n",
    "user_likes_pages_info = pd.merge(\n",
    "    user_likes_pages_info,\n",
    "    pages_info[['page_name', 'category', 'news2', 'news_politics', 'group']],\n",
    "    how='left',\n",
    "    on='page_name',\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Government & Politics: 285\n",
      "Political news: 188\n",
      "Hardnews: 103\n",
      "Lifestyle: 4730\n"
     ]
    }
   ],
   "source": [
    "# calculate the average Cramer's V per user for each group of Facebook pages\n",
    "def user_score(group, var, cat):\n",
    "    \"\"\"Return one row per resp_id holding the mean of `var`, in a column named `cat`.\n",
    "\n",
    "    group: DataFrame with a 'resp_id' column and the `var` column\n",
    "    var:   name of the column to average within each resp_id\n",
    "    cat:   name given to the averaged column in the returned DataFrame\n",
    "    \"\"\"\n",
    "    per_user_mean = group.groupby('resp_id')[var].mean()\n",
    "    # reset_index turns resp_id back into a regular column and names the values `cat`\n",
    "    return per_user_mean.reset_index(name=cat)\n",
    "\n",
    "var = 'cramer_v'\n",
    "#user_info = pd.DataFrame(user_ids, columns={'resp_id'})\n",
    "\n",
    "page_category = user_likes_pages_info['category']\n",
    "page_news2 = user_likes_pages_info['news2']\n",
    "\n",
    "# lifestyle categories, joined into one regex alternation.\n",
    "# str.contains is case-sensitive, so every label must be spelled exactly as in the category column:\n",
    "# a lower-case 'religion' would silently drop the 'Identity & Religion' pages from this group.\n",
    "terms = 'Civil Society|Public Figures|Individual opinion leaders|Research & Education|Arts & Culture|Tv Shows|Entertainment|Movies|Interests|Music|Sports|Beauty & Health|Food & Beverage|Shopping & retail|Travel|Cars and transportation|Services|Identity & Religion'\n",
    "\n",
    "# one entry per group of Facebook pages: (printed label, column added to user_info, row mask)\n",
    "# na=False: rows without a category never match instead of propagating NaN into the mask\n",
    "page_groups = [\n",
    "    # government and politics (not news)\n",
    "    ('Government & Politics', 'Politics',\n",
    "     page_category.str.contains('Government', na=False) & ~page_category.str.contains('News', na=False)),\n",
    "    # political news\n",
    "    ('Political news', 'Polnews', user_likes_pages_info['news_politics'] == 'y'),\n",
    "    # hardnews\n",
    "    ('Hardnews', 'Hardnews', page_news2 == 'hardnews'),\n",
    "    # lifestyle (pages flagged as politics or hardnews are excluded)\n",
    "    ('Lifestyle', 'Lifestyle',\n",
    "     page_category.str.contains(terms, na=False) & ~page_news2.isin(['politics', 'hardnews'])),\n",
    "]\n",
    "\n",
    "for label, column, mask in page_groups:\n",
    "    group = user_likes_pages_info[mask]\n",
    "    print(label + ': ' + str(group['page_name'].nunique()))\n",
    "    user_var = user_score(group, var, column)\n",
    "    # drop a score column left over from an earlier run so that re-running this cell\n",
    "    # replaces it instead of producing suffixed duplicates (e.g. Politics_x / Politics_y)\n",
    "    user_info = user_info.drop(columns=[column], errors='ignore').merge(user_var, on='resp_id', how='left')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [],
   "source": [
    "# write the per-user table to an Excel file, leaving out the DataFrame index\n",
    "user_info.to_excel(excel_writer='../data/user_info.xlsx', index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
